To upload the data, download the uber_pickups_lower_manhattan_wide_6h.csv and stick it in a folder and remember the directory.
# Install required libraries if necessary.
require("forecast")
require("lubridate")
Loading required package: lubridate
Attaching package: ‘lubridate’
The following object is masked from ‘package:base’:
date
require("gridExtra")
require("tidyverse")
Loading required package: tidyverse
[37m── [1mAttaching packages[22m ──────────────────────── tidyverse 1.2.1 ──[39m
[37m[32m✔[37m [34mtibble [37m 2.1.3 [32m✔[37m [34mpurrr [37m 0.3.2
[32m✔[37m [34mtidyr [37m 1.0.0 [32m✔[37m [34mdplyr [37m 0.8.3
[32m✔[37m [34mreadr [37m 1.3.1 [32m✔[37m [34mstringr[37m 1.4.0
[32m✔[37m [34mtibble [37m 2.1.3 [32m✔[37m [34mforcats[37m 0.4.0[39m
[37m── [1mConflicts[22m ─────────────────────────── tidyverse_conflicts() ──
[31m✖[37m [34mlubridate[37m::[32mas.difftime()[37m masks [34mbase[37m::as.difftime()
[31m✖[37m [34mdplyr[37m::[32mcombine()[37m masks [34mgridExtra[37m::combine()
[31m✖[37m [34mlubridate[37m::[32mdate()[37m masks [34mbase[37m::date()
[31m✖[37m [34mdplyr[37m::[32mfilter()[37m masks [34mtimeSeries[37m::filter(), [34mstats[37m::filter()
[31m✖[37m [34mlubridate[37m::[32mintersect()[37m masks [34mbase[37m::intersect()
[31m✖[37m [34mdplyr[37m::[32mlag()[37m masks [34mtimeSeries[37m::lag(), [34mstats[37m::lag()
[31m✖[37m [34mlubridate[37m::[32msetdiff()[37m masks [34mbase[37m::setdiff()
[31m✖[37m [34mlubridate[37m::[32munion()[37m masks [34mbase[37m::union()[39m
require("caret")
Loading required package: caret
Loading required package: lattice
Registered S3 method overwritten by 'data.table':
method from
print.data.table
Attaching package: ‘caret’
The following object is masked from ‘package:purrr’:
lift
require("tscount")
Loading required package: tscount
# Input data
uber_6h <- read_csv('../data/uber_pickups_lower_manhattan_wide_6h.csv') # Point this to the directory and file
Parsed with column specification:
cols(
Pickup_date = [34mcol_datetime(format = "")[39m,
East_Village = [32mcol_double()[39m,
Gramercy = [32mcol_double()[39m,
GVillage_N = [32mcol_double()[39m,
GVillage_S = [32mcol_double()[39m,
Little_Italy = [32mcol_double()[39m,
LES = [32mcol_double()[39m,
SoHo = [32mcol_double()[39m,
Union_Sq = [32mcol_double()[39m
)
uber_train <- uber_6h %>% filter(Pickup_date < ymd_hms("2015-06-01 00:00:00")) # This gives us a training set for all 8 locations
uber_test <- uber_6h %>% filter(Pickup_date >= ymd_hms("2015-06-01 00:00:00"))
uber_train <- uber_train %>% mutate(season=weekdays(Pickup_date, abbreviate=TRUE))
uber_test <- uber_test %>% mutate(season=weekdays(Pickup_date, abbreviate=TRUE))
# nrow(uber_6h) # if we want to check number of observations
# names(uber_6h) # use this command if you want to see the available time series
Train-test split
We will use the first five months as training data and predict Uber pickups for June 2015 in the East Village. The date is in positx format which can be parsed using the lubridate package.
Statistical forecasting techniques are used on the East Village training data, and will be evaluated on East Village test data.
# This block selects only east village data and splits it into train/test sets
full_ts <- msts(uber_6h$East_Village,
start=decimal_date(ymd_hms("2015-01-01 00:00:00")),
seasonal.periods=c(4, 1461))
train_ts <- msts(uber_train$East_Village,
start=decimal_date(ymd_hms("2015-01-01 00:00:00")),
seasonal.periods=c(4, 1461))
test_ts <- msts(uber_test$East_Village,
start=decimal_date(ymd_hms("2015-06-01 00:00:00")),
seasonal.periods=c(4, 1461))
#train_ts <- window(full_ts, start=decimal_date(ymd_hms("2015-01-01 00:00:00"))) # Train with obsercations up until june
#test_ts <- window(full_ts, start=decimal_date(ymd_hms("2015-06-01 00:00:00"))) # Test out predictions with June data
y_test <- as.numeric(test_ts)
test_size <- length(y_test)
# Show full data set
p1 <- full_ts %>% autoplot(series="All Trips") +
ggtitle("Full Dataset") +
guides(colour=guide_legend("Data"))+
scale_color_manual(values=c("black"))
# Show training dataset
p2 <- autoplot(train_ts, series="Training")+
autolayer(test_ts, series="Validation")+
guides(colour=guide_legend("Split"))+
scale_color_manual(values=c("black", "grey"))+
ggtitle("Train/Test Split")
grid.arrange(p1, p2, nrow=2, ncol=1)

To look at any of the other 7 pickup locations, call lil_italy <- uber_train %>% select(Little_Italy) for example.
# Show subset of data to look for patterns
jan_to_march <- train_ts %>% window(end=decimal_date(ymd_hms("2015-03-01 00:00:00")))
jan_to_march %>% autoplot()+
ggtitle("2 Months of Uber Pickup Data")

By aggregating to 6-hour windows we model the demand per “shift” in a day.
- 00:00 - 05:59 - Graveyard Shift
- 06:00 - 11:59 - Morning Shift
- 12:00 - 17:59 - Afternoon Shift
- 18:00 - 23:59 - Evening Shift
Moving Average Filter

This helps with telling the seasonality of the data. It looks like there is a spike in demand at the beginning of each month. In addition there are about three spikes during each month. This probably corresponds to weekly spikes in demand.
Analyzing the trend
lm(train_ts ~ time(train_ts)) %>% fitted() -> yhat.lm
autoplot(train_ts) +
geom_line(mapping=aes(x=time(train_ts), y=yhat.lm), color="red")+
ggtitle("Linear Trend Fit")+
xlab("Time")+
ylab("Pickups")

There is a slight linear trend. Let’s see what the ACF and PACF plots look like.
p1 <- ggAcf(train_ts) + ggtitle("ACF of Uber Pickups")
p2 <- ggPacf(train_ts) + ggtitle("PACF of Uber Pickups")
grid.arrange(p1, p2, nrow=2, ncol=1)

Not very informative although it initially hints toward an AR(p) model after a difference.
Before checking the lag plot, the trend should be removed via differencing.
Differencing
# code for differencing the training data
train_diff <- diff(train_ts)
lm(train_diff ~ time(train_diff)) %>% fitted() -> yhat.lm
train_diff %>% autoplot() +
geom_line(mapping=aes(x=time(train_diff), y=yhat.lm), color="red") +
ggtitle("Differenced Pickup Data") +
ylab("Change in Pickups")

# Comparison of time series plots
lm(train_ts ~ time(train_ts)) %>% fitted() -> yhat.lm1
p1 <- autoplot(train_ts) +
geom_line(mapping=aes(x=time(train_ts), y=yhat.lm1), color="red")+
ggtitle("Linear Trend Fit")+
xlab("Time")+
ylab("Pickups")
# code for differencing the training data
train_diff <- diff(train_ts)
lm(train_diff ~ time(train_diff)) %>% fitted() -> yhat.lm2
p2 <- train_diff %>% autoplot() +
geom_line(mapping=aes(x=time(train_diff), y=yhat.lm2), color="red") +
ggtitle("Differenced Pickup Data") +
ylab("Change in Pickups")
grid.arrange(p1, p2, nrow=1, ncol=2)

Zoom in
diffwin <- train_diff %>% window(end=decimal_date(ymd_hms("2015-03-01 00:00:00")))
diffwin %>% autoplot() + ggtitle("2 Months of Differenced Pickups")

diffwin %>% autoplot(series="Original Data")+
autolayer(ma(diffwin, 4), series="Moving Average")+
jan + feb + mar +
guides(colour=guide_legend("Split"))+
scale_color_manual(values=c("black", "grey"))+
ggtitle("Moving Average of Training Data (Jan and Feb)")
p1 <- train_diff %>% ggAcf() + ggtitle("ACF of Differenced Uber Pickups")
p2 <- train_diff %>% ggPacf() + ggtitle("PACF of Differenced Uber Pickups")
grid.arrange(p1, p2, nrow=1, ncol=2)
train_diff %>% Acf(plot=FALSE, lag.max=30)
Autocorrelations of series ‘.’, by lag
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
1.000 -0.387 -0.168 -0.189 0.598 -0.307 -0.039 -0.118 0.274 -0.157 0.015 -0.129 0.233 -0.135 0.031 -0.125 0.220 -0.130 0.021 -0.149 0.260
21 22 23 24 25 26 27 28 29 30
-0.117 -0.025 -0.291 0.558 -0.179 -0.145 -0.343 0.855 -0.337 -0.145
# PACF of differenced data
train_diff %>% Pacf(plot=FALSE, lag.max=30)
Partial autocorrelations of series ‘.’, by lag
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
-0.387 -0.373 -0.577 0.246 -0.030 0.088 0.010 -0.163 -0.065 -0.090 -0.164 0.078 -0.058 0.005 -0.067 -0.042 -0.023 -0.068 -0.192 0.003 -0.031
22 23 24 25 26 27 28 29 30
-0.066 -0.491 0.181 0.062 -0.101 -0.448 0.428 0.114 0.145
There is clear seasonality and a sharp cutoff after lag 26-ish. Each lag is 1 shift, so 24 shifts would be 6 days of pickups, 26 shifts would be 6.5 days of pickups (evening shift of the 6th day).
In the ACF plot there is a spike at lag 4, and another lower spike at lag 8, lag 12, etc. In addition, there are large spikes at lag 28n. This indicates an AR(4) component and seasonal component at lag 28. The PACF spikes and cuts off after lag 3, and spikes again at lag 28. This further suggests an AR(4) or AR(3) model.

We looked at the lags for the differenced time series. We suspect that there is a positive autocorrelation at lag 4n because of assumed daily seasonality. The other place to check would be where the ACF’s seasonal decay exhibited spikes at lag 28n. Looking at the plot above it’s clear that this is a highly predictive lag.
Next we difference the series S=28 times for the D=1 seasonal difference. Here we can see what the order of the seasonal component would be.
train_sdiff <- train_ts %>% diff() %>% diff(lag=28)
p1 <- train_sdiff %>% ggAcf() + ggtitle("ACF of Diff(28) data")
p2 <- train_sdiff %>% ggPacf() + ggtitle("PACF of Diff(28) data")
grid.arrange(p1, p2, nrow=2, ncol=1)

# ACF of seasonal component
train_ts %>% diff() %>% diff(lag=28) %>% Acf(plot=FALSE, lag.max=30)
Autocorrelations of series ‘.’, by lag
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
1.000 -0.275 -0.206 -0.063 0.213 -0.073 -0.069 0.035 -0.008 0.006 0.002 0.010 0.014 -0.064 0.041 -0.018 0.013 -0.022 0.010 -0.026 0.042
21 22 23 24 25 26 27 28 29 30
-0.038 0.066 0.021 -0.082 -0.017 0.138 0.087 -0.406 0.078 0.148
# PACF of seasonal component
train_ts %>% diff() %>% diff(lag=28) %>% Pacf(plot=FALSE, lag.max=30)
Partial autocorrelations of series ‘.’, by lag
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
-0.275 -0.305 -0.264 0.038 -0.059 -0.058 0.005 -0.070 -0.015 0.000 -0.004 0.040 -0.056 0.005 -0.035 -0.019 -0.012 -0.021 -0.046 0.016 -0.049
22 23 24 25 26 27 28 29 30
0.064 0.082 -0.039 -0.015 0.103 0.189 -0.298 -0.129 -0.075
The ACF and PACF of the seasonal differenced data look very similar. There is a spike at lag 1 and at lag 28 (ACF) and lag 29 (PACF). Besides these two spikes, the rest of the values tend to stick around zero. This is indicative of an SMA(1) seasonal component. There doesn’t appear to be any SAR(P) component. Since we took one seasonal difference, then D=1.
We will now make an ARIMA model based on the ACF and PACF plots of the diff(1) and diff(28) version of our original data.
SARIMA Models
The first model is based on the model presented in the time series analysis book mixed with the seasonality that we observe in the Uber dataset. This is a model that is used in economics and was used to predict airline passengers in the book.
# Intuitive model based on the plots
m1 <- Arima(train_ts,
order=c(1, 1, 0),
seasonal=list(order=c(0,1,1), period=28))
yhat <- m1 %>% forecast(h=test_size)
# Plot the predictions
p1 <- yhat %>% autoplot()
p2 <- ggAcf(residuals(m1)) + ggtitle("ACF of m1 Residuals")
grid.arrange(p1, p2, nrow=1, ncol=2)

# Print out summary of the coefficients
m1 %>% summary()
Series: train_ts
ARIMA(1,1,0)(0,1,1)[28]
Coefficients:
ar1 sma1
-0.3765 -0.9039
s.e. 0.0426 0.0498
sigma^2 estimated as 26438: log likelihood=-3766.04
AIC=7538.09 AICc=7538.13 BIC=7551.15
Training set error measures:
ME RMSE MAE MPE MAPE MASE ACF1
Training set 7.162789 158.37 98.86515 Inf Inf NaN -0.06150298
The pattern looks like it was captured in the predictive mean of the forecast, but the prediction intervals get really wide for forecasts beyond about 2 weeks. At this point, the prediction interval captures negative values.
Next we use p=4 since this is what was uncovered in the ACF and PACF plots of the differenced time series.
m2 <- Arima(train_ts,
order=c(4, 1, 0),
seasonal=list(order=c(0,1,1), period=28))
yhat2 <- m2 %>% forecast(h=test_size)
p1 <- yhat2 %>% autoplot()
p2 <- ggAcf(residuals(m2)) + ggtitle("ACF of Residuals")
grid.arrange(p1, p2, nrow=1, ncol=2)

This second model dips below zero in the prediction interval after the first week. The AIC and BIC of this model are also lower than that of the first economic model.
The last model is one that is chosen by auto.arima on the basis of AIC. However, the function needs a time series with frequency defined and I struggled to understand the frequency parameter. I made a guess that it would be the number of days in a year (1 season) divided by the value of the season present in my data.
# Auto arima with forced seasonality
train_seasonal <- ts(uber_train$East_Village, frequency=365.4/28)
m_auto <- auto.arima(train_seasonal, D=1)
m_auto %>% forecast(h=test_size) %>% autoplot()

m_auto_y <- m_auto %>% forecast(h=test_size)
m_auto <- Arima(train_ts,
order=c(3, 0, 2),
seasonal=list(order=c(2,1,0), period=13))
m_auto_y <- m_auto %>% forecast(h=test_size)

This model does worse than the first two.
# Comparison of model results
p1 <- ggplot()+
geom_point(mapping=aes(x=y_test, y=yhat$mean))+
xlim(min(y_test), max(y_test))+
ylim(min(y_test), max(y_test))+
ggtitle("Actual vs. Predicted (m1)")
p2 <- ggplot()+
geom_point(mapping=aes(x=y_test, y=yhat2$mean))+
xlim(min(y_test), max(y_test))+
ylim(min(y_test), max(y_test))+
ggtitle("Actual vs. Predicted (m2)")
grid.arrange(p1, p2, nrow=1, ncol=2)

m7 <- Arima(train_ts,
order=c(4, 1, 0),
seasonal=list(order=c(0,1,4), period=28))
yhat7 <- m7 %>% forecast(h=test_size)
p1 <- yhat7 %>% autoplot()
p2 <- ggAcf(residuals(m7)) + ggtitle("ACF of Residuals")
grid.arrange(p1, p2, nrow=1, ncol=2)

BIC(m7)
[1] 7460.809
m8 <- Arima(train_ts,
order=c(3, 1, 0),
seasonal=list(order=c(0,1,1), period=28))
yhat8 <- m8 %>% forecast(h=test_size)
p1 <- yhat8 %>% autoplot()
p2 <- ggAcf(residuals(m8)) + ggtitle("ACF of Residuals")
grid.arrange(p1, p2, nrow=1, ncol=2)

Ljung-Box Tests
# Ljung Box tests
fit_test <- sarima(train_ts, p=4, d=1, q=0, P=0, D=1, Q=1, S=28)
fit_test
m1 was pretty good but didn’t pass the Ljung-Box test of significance. However m2 did. And this also assumes normally-distributed errors which isn’t exactly true with our data since it’s Poisson distributed.
Poisson Models
# One-hot encoding function for weekly seasonal variable
dmy <- dummyVars(~season, data = uber_train)
trainCovariates <- data.frame(predict(dmy ,newdata=data.frame(season=uber_train$season)))
testCovariates <- data.frame(predict(dmy ,newdata=data.frame(season=uber_test$season)))
#
m3 <- tsglm(train_ts, model=list(past_obs=1, past_mean=1), distr="poisson", xreg=trainCovariates)
At least one estimated parameter corresponding to a covariate is very close to
zero, the boundary of the parameter constraint for this parameter. This might
be an appropriate estimation but could also indicate that the parameter is
actually negative and the chosen model is too restrictive. In such a case one
could consider using a model with the logarithmic link function. Please check
results carefully.
yhat_m3 <- predict(m3, n.ahead=test_size, newxreg=testCovariates)
predint_m3 <- data.frame(yhat_m3$interval)
ggplot()+
geom_line(mapping=aes(x=time(train_ts), y=train_ts))+
geom_ribbon(mapping=aes(x=time(test_ts), ymin=predint_m3$lower, ymax = predint_m3$upper), fill="blue", alpha=0.5)+
geom_line(mapping=aes(x=time(test_ts), y=yhat_m3$median), color="blue")+
ggtitle("Poisson GLM with ARMA(1,1) and Weekday Dummy Variable")

m4 <- tsglm(train_ts, model=list(past_obs=1, past_mean=1), distr="poisson")
yhat_m4 <- predict(m4, n.ahead=test_size)
predint_m4 <- data.frame(yhat_m4$interval)
ggplot()+
geom_line(mapping=aes(x=time(train_ts), y=train_ts))+
geom_ribbon(mapping=aes(x=time(test_ts), ymin=predint_m4$lower, ymax = predint_m4$upper), fill="blue", alpha=0.5)+
geom_line(mapping=aes(x=time(test_ts), y=yhat_m4$median), color="blue")+
ggtitle("Poisson GLM with ARMA(1,1)")

mean((y_test - yhat_m6$median)^2)
[1] 29587.04
dmy <- dummyVars(~season, data = uber_train)
trainCovariates <- data.frame(predict(dmy ,newdata=data.frame(season=uber_train$season)))
testCovariates <- data.frame(predict(dmy ,newdata=data.frame(season=uber_test$season)))
m6 <- tsglm(train_ts, model=list(past_obs=4, past_mean=1), distr="poisson", xreg=trainCovariates)
predint_m6 <- data.frame(yhat_m6$interval)
yhat_m6 <- predict(m6, n.ahead=test_size, newxreg=testCovariates)
ggplot()+
geom_line(mapping=aes(x=time(train_ts), y=train_ts))+
geom_ribbon(mapping=aes(x=time(test_ts), ymin=predint_m6$lower, ymax = predint_m6$upper), fill="blue", alpha=0.5)+
geom_line(mapping=aes(x=time(test_ts), y=yhat_m6$median), color="blue")+
ggtitle("Poisson GLM ARMA(4,1) and Weekday Dummy Variable")

That works, should I include the AIC / BIC tables up here too? And the residuals plots and stuff? Alrighty, if you could just put an example section with your model and I can just copy the same format. I have plenty of BS I can stick in d:
Jk im gonna delete this so we have space haha but I’ll copy it in a separate texct file Could you include a quick writeup on it? I was gonna write up something quick about SARIMA, and William is gonna write up something about Poisson. Sounds good, would you be down to put it up here? We were trying to think of what to put up in this section (unless you have an idea of what to put . Sounds good
Our analysis considered three models:\ thats right, and multivariate generalized additive models, or VGAM’s.Sure, ill do a write up in the model fitting section, ill type out the actual models mathematically and try to explain them. yea i can write it here instead, for model fitting and validation put plots of your forecast beside the validation and talk about how well they fit the data. also probably include tables with the models and errors on the validation set. Yes, and the ljung box stuff too, the procedures you used to choose the best model should be there like the aic, but lets not go too crazy lol. I think at most we include validation error, aic, residual or Q-Q plots, and ljung box should be good. for sure, alright have fun lol
train_freq <- ts(uber_train$East_Village, frequency=28)
test_freq <- ts(uber_test$East_Village, frequency=28)
train_freq %>% auto.arima() -> m10
p1 <- m10 %>% forecast(h=120) %>% autoplot()
p2 <- m10 %>% residuals() %>% ggAcf() + ggtitle("ACF of Residuals")
grid.arrange(p1, p2, nrow=1, ncol=2)
checkresiduals(m10)
Ljung-Box test
data: Residuals from ARIMA(4,0,0)(2,1,0)[28]
Q* = 24.923, df = 50, p-value = 0.9989
Model df: 6. Total lags used: 56

naives_m <- Arima(train_ts,
order=c(0, 0, 0),
seasonal=list(order=c(0,1,0), period=28))
sfit <- train_ts %>% snaive(m=28)
p1 <- naives_m %>% forecast(h=test_size) %>% autoplot(PI=FALSE)
p2 <- sfit %>% forecast(h=test_size) %>% autoplot()
grid.arrange(p1, p2, nrow=2, ncol=1)

sfit <- snaive(train_ts)
yhat_sfit <- sfit %>% forecast(h=test_size)
sfit %>% forecast(h=test_size) %>% autoplot()
Accuracy Measures
# SARIMA models
m1 <- Arima(train_ts,
order=c(1, 1, 0),
seasonal=list(order=c(0,1,1), period=28))
m2 <- Arima(train_ts,
order=c(4, 1, 0),
seasonal=list(order=c(0,1,1), period=28))
m3 <- Arima(train_ts,
order=c(4, 1, 0),
seasonal=list(order=c(0,1,4), period=28))
m4 <- Arima(train_ts,
order=c(3, 0, 2),
seasonal=list(order=c(2,1,0), period=13))
m5 <- Arima(train_ts,
order=c(4, 0, 0),
seasonal=list(order=c(2,1,0), period=28))
accuracy(yhat_m1, test_ts)
ME RMSE MAE MPE MAPE MASE ACF1 Theil's U
Training set 7.162789 158.3700 98.86515 Inf Inf NaN -0.06150298 NA
Test set -183.437561 494.5975 411.62220 -67.0349 88.51759 NaN -0.34752397 0.9912245
accuracy(yhat_m2, test_ts)
ME RMSE MAE MPE MAPE MASE ACF1 Theil's U
Training set 7.269319 142.9743 92.78463 -Inf Inf NaN 0.0640973 NA
Test set -110.032570 471.5929 388.80887 -51.40147 79.25787 NaN -0.3495840 0.9525737
accuracy(yhat_m3, test_ts)
ME RMSE MAE MPE MAPE MASE ACF1 Theil's U
Training set 7.317047 138.2777 90.03849 -Inf Inf NaN 0.06230789 NA
Test set -90.257804 461.6869 380.91922 -47.90756 76.96151 NaN -0.34702807 0.920203
accuracy(yhat_m4, test_ts)
ME RMSE MAE MPE MAPE MASE ACF1 Theil's U
Training set 5.363351 331.8169 243.259 -Inf Inf NaN -0.0003350836 NA
Test set -1.991453 409.3317 326.022 -31.38139 65.15798 NaN -0.0211198402 0.7923496
accuracy(yhat_m5, test_ts)
ME RMSE MAE MPE MAPE MASE ACF1 Theil's U
Training set 6.122341 146.6422 89.46945 -Inf Inf NaN 0.007535442 NA
Test set 20.745266 446.7924 362.64753 -24.49447 65.80993 NaN -0.325192843 0.9155374
accuracy(yhat_naive, test_ts)
ME RMSE MAE MPE MAPE MASE ACF1 Theil's U
Training set 9.250045 197.3485 112.4388 -Inf Inf NaN 0.3886022 NA
Test set 47.924370 445.6627 359.8908 -16.1501 63.4041 NaN -0.3310975 0.9683322
mae_tsglm28
[1] 90.35833
---
title: "Uber Trips Analysis"
output: html_notebook
---


To upload the data, download the `uber_pickups_lower_manhattan_wide_6h.csv` and stick it in a folder and remember the directory. 

```{r}
# Install required libraries if necessary.
require("forecast")
require("lubridate")
require("gridExtra")
require("tidyverse")
require("caret")
require("tscount")
```

```{r}
# Input data
uber_6h <- read_csv('../data/uber_pickups_lower_manhattan_wide_6h.csv') # Point this to the directory and file

uber_train <- uber_6h %>% filter(Pickup_date < ymd_hms("2015-06-01 00:00:00")) # This gives us a training set for all 8 locations
uber_test <- uber_6h %>% filter(Pickup_date >= ymd_hms("2015-06-01 00:00:00"))
uber_train <- uber_train %>% mutate(season=weekdays(Pickup_date, abbreviate=TRUE))
uber_test <- uber_test %>% mutate(season=weekdays(Pickup_date, abbreviate=TRUE))
# nrow(uber_6h) # if we want to check number of observations
# names(uber_6h) # use this command if you want to see the available time series
train_freq <- ts(uber_train$East_Village, frequency=28)
test_freq <- ts(uber_test$East_Village, frequency=28)
```

```{r}
head(uber_test)
```


## Train-test split

We will use the first five months as training data and predict Uber pickups for June 2015 in the East Village. The date is in positx format which can be parsed using the lubridate package. 

Statistical forecasting techniques are used on the East Village training data, and will be evaluated on East Village test data. 


```{r}
# This block selects only east village data and splits it into train/test sets
full_ts <- msts(uber_6h$East_Village,
                    start=decimal_date(ymd_hms("2015-01-01 00:00:00")),
                    seasonal.periods=c(4, 1461))

train_ts <- msts(uber_train$East_Village,
                    start=decimal_date(ymd_hms("2015-01-01 00:00:00")),
                    seasonal.periods=c(4, 1461))

test_ts <- msts(uber_test$East_Village,
                    start=decimal_date(ymd_hms("2015-06-01 00:00:00")),
                    seasonal.periods=c(4, 1461))


#train_ts <- window(full_ts, start=decimal_date(ymd_hms("2015-01-01 00:00:00"))) # Train with obsercations up until june

#test_ts <- window(full_ts, start=decimal_date(ymd_hms("2015-06-01 00:00:00"))) # Test out predictions with June data
y_test <- as.numeric(test_ts)
test_size <- length(y_test)
```


```{r}
# Show full data set
p1 <- full_ts %>% autoplot(series="All Trips") + 
  ggtitle("Full Dataset") +
  guides(colour=guide_legend("Data"))+
  scale_color_manual(values=c("black"))

# Show training dataset
p2 <- autoplot(train_ts, series="Training")+
  autolayer(test_ts,  series="Validation")+
  guides(colour=guide_legend("Split"))+
  scale_color_manual(values=c("black", "grey"))+
  ggtitle("Train/Test Split")

grid.arrange(p1, p2, nrow=2, ncol=1)
```

To look at any of the other 7 pickup locations, call `lil_italy <- uber_train %>% select(Little_Italy)` for example.

```{r}
# Show subset of data to look for patterns

jan_to_march <- train_ts %>% window(end=decimal_date(ymd_hms("2015-03-01 00:00:00")))

jan_to_march %>% autoplot()+
  ggtitle("2 Months of Uber Pickup Data")
```


By aggregating to 6-hour windows we model the demand per "shift" in a day.

- 00:00 - 05:59 - Graveyard Shift
- 06:00 - 11:59 - Morning Shift
- 12:00 - 17:59 - Afternoon Shift
- 18:00 - 23:59 - Evening Shift

## Moving Average Filter

```{r}
jan <- geom_vline(xintercept = decimal_date(ymd_hms("2015-01-01 00:00:00")), linetype="dashed", color = "blue", size=0.5)
feb <- geom_vline(xintercept = decimal_date(ymd_hms("2015-02-01 00:00:00")), linetype="dashed", color = "blue", size=0.5)
mar <- geom_vline(xintercept = decimal_date(ymd_hms("2015-03-01 00:00:00")), linetype="dashed", color = "blue", size=0.5)
apr <- geom_vline(xintercept = decimal_date(ymd_hms("2015-04-01 00:00:00")), linetype="dashed", color = "blue", size=0.5)
may <- geom_vline(xintercept = decimal_date(ymd_hms("2015-05-01 00:00:00")), linetype="dashed", color = "blue", size=0.5)
jun <- geom_vline(xintercept = decimal_date(ymd_hms("2015-06-01 00:00:00")), linetype="dashed", color = "blue", size=0.5)

train_ts %>% autoplot(series="Original Data")+
  autolayer(ma(train_ts, 4), series="Moving Average")+
  jan + feb + mar + apr + may + jun + # adding month lines
  guides(colour=guide_legend("Split"))+
  scale_color_manual(values=c("black", "grey"))+
  ggtitle("Moving Average of Training Data")

```

This helps with telling the seasonality of the data. It looks like there is a spike in demand at the beginning of each month. In addition there are about three spikes during each month. This probably corresponds to __weekly spikes in demand__. 

## Analyzing the trend

```{r}
lm(train_ts ~ time(train_ts)) %>% fitted() -> yhat.lm
autoplot(train_ts) + 
  geom_line(mapping=aes(x=time(train_ts), y=yhat.lm), color="red")+
  ggtitle("Linear Trend Fit")+
  xlab("Time")+
  ylab("Pickups")
```


There is a slight linear trend. Let's see what the ACF and PACF plots look like.


```{r}
p1 <- ggAcf(train_ts) + ggtitle("ACF of Uber Pickups")
p2 <- ggPacf(train_ts) + ggtitle("PACF of Uber Pickups")
grid.arrange(p1, p2, nrow=2, ncol=1)
```



Not very informative although it initially hints toward an AR(p) model after a difference. 

Before checking the lag plot, the trend should be removed via differencing.

## Differencing 

```{r}
# code for differencing the training data
train_diff <- diff(train_ts)
lm(train_diff ~ time(train_diff)) %>% fitted() -> yhat.lm

train_diff %>% autoplot() + 
  geom_line(mapping=aes(x=time(train_diff), y=yhat.lm), color="red") + 
  ggtitle("Differenced Pickup Data") + 
  ylab("Change in Pickups")
```

```{r}
# Comparison of time series plots

lm(train_ts ~ time(train_ts)) %>% fitted() -> yhat.lm1
p1 <- autoplot(train_ts) + 
  geom_line(mapping=aes(x=time(train_ts), y=yhat.lm1), color="red")+
  ggtitle("Original Data with Linear Fit")+
  xlab("Time")+
  ylab("Pickups")

# code for differencing the training data
train_diff <- diff(train_ts)
lm(train_diff ~ time(train_diff)) %>% fitted() -> yhat.lm2

p2 <- train_diff %>% autoplot() + 
  geom_line(mapping=aes(x=time(train_diff), y=yhat.lm2), color="red") + 
  ggtitle("Differenced Data with Linear Fit") + 
  ylab("Change in Pickups")

grid.arrange(p1, p2, nrow=1, ncol=2)
```



## Zoom in

```{r}
diffwin <- train_diff %>% window(end=decimal_date(ymd_hms("2015-03-01 00:00:00"))) 
diffwin %>% autoplot() + ggtitle("2 Months of Differenced Pickups")
```

```{r}

diffwin %>% autoplot(series="Original Data")+
  autolayer(ma(diffwin, 28), series="Moving Average")+
  jan + feb + mar +
  guides(colour=guide_legend("Split"))+
  scale_color_manual(values=c("black", "grey"))+
  ggtitle("Moving Average of Training Data (Jan and Feb)")
```

```{r}
p1 <- train_diff  %>% ggAcf() + ggtitle("ACF of Differenced Uber Pickups")
p2 <- train_diff %>% ggPacf() + ggtitle("PACF of Differenced Uber Pickups")
grid.arrange(p1, p2, nrow=1, ncol=2)
```


```{r}
# ACF of differenced data
train_diff %>% Acf(plot=FALSE, lag.max=30)
```

```{r}
# PACF of differenced data
train_diff %>% Pacf(plot=FALSE, lag.max=30)
```


There is clear seasonality and a sharp cutoff after lag 26-ish. Each lag is 1 shift, so 24 shifts would be 6 days of pickups, 26 shifts would be 6.5 days of pickups (evening shift of the 6th day). 

In the ACF plot there is a spike at lag 4, and another lower spike at lag 8, lag 12, etc. In addition, there are large spikes at lag 28n. This indicates an AR(4) component and seasonal component at lag 28. The PACF spikes and cuts off after lag 3, and spikes again at lag 28. This further suggests an AR(4) or AR(3) model.

```{r}
gglagplot(train_diff, set.lags= c(4, 8, 12, 28, 56, 84), do.lines=FALSE, colour=FALSE) + ggtitle("Autocorrelation for different lags: Train Diff")
```

We looked at the lags for the differenced time series. We suspect that there is a positive autocorrelation at lag 4n because of assumed daily seasonality. The other place to check would be where the ACF's seasonal decay exhibited spikes at lag 28n. Looking at the plot above it's clear that this is a highly predictive lag.

Next we difference the series S=28 times for the D=1 seasonal difference. Here we can see what the order of the seasonal component would be.

```{r}
train_sdiff <- train_ts %>% diff() %>% diff(lag=28)

p1 <- train_sdiff %>% ggAcf() + ggtitle("ACF of Diff(28) data")
p2 <- train_sdiff %>% ggPacf() + ggtitle("PACF of Diff(28) data")
grid.arrange(p1, p2, nrow=2, ncol=1)
```


```{r}
# ACF of seasonal component
train_ts %>% diff() %>% diff(lag=28) %>% Acf(plot=FALSE, lag.max=30) 
```

```{r}
# PACF of seasonal component
train_ts %>% diff() %>% diff(lag=28) %>% Pacf(plot=FALSE, lag.max=30) 
```

The ACF and PACF of the seasonal differenced data look very similar. There is a spike at lag 1 and at lag 28 (ACF) and lag 29 (PACF). Besides these two spikes, the rest of the values tend to stick around zero. This is indicative of an SMA(1) seasonal component. There doesn't appear to be any SAR(P) component. Since we took one seasonal difference, then D=1.

We will now __make an ARIMA model based on the ACF and PACF plots of the diff(1) and diff(28)__ version of our original data. 


## SARIMA Models


The first model is based on the model presented in the time series analysis book mixed with the seasonality that we observe in the Uber dataset. This is a model that is used in economics and was used to predict airline passengers in the book. 

```{r}
# Intuitive model based on the plots
m1 <- Arima(train_ts, 
      order=c(1, 1, 0), 
      seasonal=list(order=c(0,1,1), period=28))

yhat <- m1 %>% forecast(h=test_size)

# Plot the predictions
p1 <- yhat %>% autoplot() 
p2 <- ggAcf(residuals(m1)) + ggtitle("ACF of m1 Residuals")

grid.arrange(p1, p2, nrow=1, ncol=2)

# Print out summary of the coefficients
m1 %>% summary()

m1_y <- m1 %>% forecast(h=test_size)
```

The pattern looks like it was captured in the predictive mean of the forecast, but the prediction intervals get really wide for forecasts beyond about 2 weeks. At this point, the prediction interval captures negative values. 

Next we use p=4 since this is what was uncovered in the ACF and PACF plots of the differenced time series.


```{r}
m2 <- Arima(train_ts, 
      order=c(4, 1, 0), 
      seasonal=list(order=c(0,1,1), period=28))


m2_y <- m2 %>% forecast(h=test_size)

p1 <- yhat2 %>% autoplot()
p2 <- ggAcf(residuals(m2)) + ggtitle("ACF of Residuals")

grid.arrange(p1, p2, nrow=1, ncol=2)
```

This second model dips below zero in the prediction interval after the first week. The AIC and BIC of this model are also lower than that of the first economic model.

The last model is one that is chosen by `auto.arima` on the basis of AIC. However, the function needs a time series with frequency defined and I struggled to understand the frequency parameter. I made a guess that it would be the number of days in a year (1 season) divided by the value of the season present in my data.

```{r}
# Auto arima with forced seasonality
train_seasonal <- ts(uber_train$East_Village, frequency=365.4/28)
m_auto <- auto.arima(train_seasonal, D=1) 
m_auto %>% forecast(h=test_size) %>% autoplot()
m_auto_y <- m_auto %>% forecast(h=test_size)
```

```{r}
m_auto <- Arima(train_ts, 
      order=c(3, 0, 2), 
      seasonal=list(order=c(2,1,0), period=13))

m_auto_y <- m_auto %>% forecast(h=test_size)
```

```{r}
sfit <- snaive(train_ts) 
yhat_sfit <- sfit %>% forecast(h=test_size)
sfit %>% forecast(h=test_size) %>% autoplot()
```



This model does worse than the first two. 

```{r}
# Comparison of model results
p1 <- ggplot()+
  geom_point(mapping=aes(x=y_test, y=yhat$mean))+
  xlim(min(y_test), max(y_test))+
  ylim(min(y_test), max(y_test))+
  ggtitle("Actual vs. Predicted (m1)")

p2 <- ggplot()+
  geom_point(mapping=aes(x=y_test, y=yhat2$mean))+
  xlim(min(y_test), max(y_test))+
  ylim(min(y_test), max(y_test))+
  ggtitle("Actual vs. Predicted (m2)")

grid.arrange(p1, p2, nrow=1, ncol=2)
```


```{r}
m7 <- Arima(train_ts, 
      order=c(4, 1, 0), 
      seasonal=list(order=c(0,1,4), period=28))

m7_y <- m7 %>% forecast(h=test_size)

p1 <- yhat7 %>% autoplot()
p2 <- ggAcf(residuals(m7)) + ggtitle("ACF of Residuals")

grid.arrange(p1, p2, nrow=1, ncol=2)
```

```{r}
BIC(m2)
BIC(m7)
```

```{r}
m8 <- Arima(train_ts, 
      order=c(3, 1, 0), 
      seasonal=list(order=c(0,1,1), period=28))

yhat8 <- m8 %>% forecast(h=test_size)

p1 <- yhat8 %>% autoplot()
p2 <- ggAcf(residuals(m8)) + ggtitle("ACF of Residuals")

grid.arrange(p1, p2, nrow=1, ncol=2)
```

### Ljung-Box Tests

```{r}
# Ljung Box tests
fit_test <- sarima(train_ts, p=4, d=1, q=0, P=0, D=1,  Q=1, S=28)

fit_test
```

`m1` was pretty good but didn't pass the Ljung-Box test of significance. However `m2` did. And this also assumes normally-distributed errors which isn't exactly true with our data since it's Poisson distributed. 


## Poisson Models

```{r}
# One-hot encoding function for weekly seasonal variable
dmy <- dummyVars(~season, data = uber_train)
trainCovariates <- data.frame(predict(dmy ,newdata=data.frame(season=uber_train$season)))
testCovariates <- data.frame(predict(dmy ,newdata=data.frame(season=uber_test$season)))

# 
m3 <- tsglm(train_ts, model=list(past_obs=1, past_mean=1), distr="poisson", xreg=trainCovariates)

yhat_m3 <- predict(m3, n.ahead=test_size, newxreg=testCovariates)

predint_m3 <- data.frame(yhat_m3$interval)

ggplot()+
  geom_line(mapping=aes(x=time(train_ts), y=train_ts))+
  geom_ribbon(mapping=aes(x=time(test_ts), ymin=predint_m3$lower, ymax = predint_m3$upper), fill="blue", alpha=0.5)+
  geom_line(mapping=aes(x=time(test_ts), y=yhat_m3$median), color="blue")+
  ggtitle("Poisson GLM with ARMA(1,1) and Weekday Dummy Variable")
```

```{r}
m4 <- tsglm(train_ts, model=list(past_obs=1, past_mean=1), distr="poisson")

yhat_m4 <- predict(m4, n.ahead=test_size)

predint_m4 <- data.frame(yhat_m4$interval)

ggplot()+
  geom_line(mapping=aes(x=time(train_ts), y=train_ts))+
  geom_ribbon(mapping=aes(x=time(test_ts), ymin=predint_m4$lower, ymax = predint_m4$upper), fill="blue", alpha=0.5)+
  geom_line(mapping=aes(x=time(test_ts), y=yhat_m4$median), color="blue")+
  ggtitle("Poisson GLM with ARMA(1,1)")
```

```{r}

m5 <- tsglm(train_ts, model=list(past_obs=28, past_mean=1), distr="poisson")

yhat_m5 <- predict(m5, n.ahead=test_size)

predint_m5 <- data.frame(yhat_m5$interval)

mean((y_test - yhat_m5$median)^2)

ggplot()+
  geom_line(mapping=aes(x=time(train_ts), y=train_ts))+
  geom_ribbon(mapping=aes(x=time(test_ts), ymin=predint_m5$lower, ymax = predint_m5$upper), fill="blue", alpha=0.5)+
  geom_line(mapping=aes(x=time(test_ts), y=yhat_m5$median), color="blue")+
  ggtitle("Poisson GLM with ARMA(28,1)")

m13 <- tsglm(train_ts, model=list(past_obs=28, past_mean=28), distr="poisson")

yhat_m6 <- predict(m13, n.ahead=test_size)

predint_m6 <- data.frame(yhat_m6$interval)

mean((y_test - yhat_m6$median)^2)

ggplot()+
  geom_line(mapping=aes(x=time(train_ts), y=train_ts))+
  geom_ribbon(mapping=aes(x=time(test_ts), ymin=predint_m6$lower, ymax = predint_m6$upper), fill="blue", alpha=0.5)+
  geom_line(mapping=aes(x=time(test_ts), y=yhat_m6$median), color="blue")+
  ggtitle("Poisson GLM with ARMA(28,28)")
```

```{r}
dmy <- dummyVars(~season, data = uber_train)
trainCovariates <- data.frame(predict(dmy ,newdata=data.frame(season=uber_train$season)))
testCovariates <- data.frame(predict(dmy ,newdata=data.frame(season=uber_test$season)))

m6 <- tsglm(train_ts, model=list(past_obs=4, past_mean=1), distr="poisson", xreg=trainCovariates)
predint_m6 <- data.frame(yhat_m6$interval)

yhat_m6 <- predict(m6, n.ahead=test_size, newxreg=testCovariates)

ggplot()+
  geom_line(mapping=aes(x=time(train_ts), y=train_ts))+
  geom_ribbon(mapping=aes(x=time(test_ts), ymin=predint_m6$lower, ymax = predint_m6$upper), fill="blue", alpha=0.5)+
  geom_line(mapping=aes(x=time(test_ts), y=yhat_m6$median), color="blue")+
  ggtitle("Poisson GLM ARMA(4,1) and Weekday Dummy Variable")
```


That works, should I include the AIC / BIC tables up here too? And the residuals plots and stuff? Alrighty, if you could just put an example section with your model and I can just copy the same format. I have plenty of BS I can stick in d:

Jk im gonna delete this so we have space haha but I'll copy it in a separate texct file
Could you include a quick writeup on it? I was gonna write up something quick about SARIMA, and William is gonna write up something about Poisson. Sounds good, would you be down to put it up here? We were trying to think of what to put up in this section (unless you have an idea of what to put . Sounds good


Our analysis considered three models:\\
thats right, and multivariate generalized additive models, or VGAM's.Sure, ill do a write up in the model fitting section, ill type out the actual models mathematically and try to explain them. yea i can write it here instead, for model fitting and validation put plots of your forecast beside the validation and talk about how well they fit the data. also probably include tables with the models and errors on the validation set. Yes, and the ljung box stuff too, the procedures you used to choose the best model should be there like the aic, but lets not go too crazy lol. I think at most we include validation error, aic, residual or Q-Q plots, and ljung box should be good. for sure, alright have fun lol

```{r}
train_freq <- ts(uber_train$East_Village, frequency=28)
test_freq <- ts(uber_test$East_Village, frequency=28)

train_freq %>% auto.arima() -> m10

p1 <- m10 %>% forecast(h=120) %>% autoplot()
p2 <- m10 %>% residuals() %>% ggAcf() + ggtitle("ACF of Residuals")

grid.arrange(p1, p2, nrow=1, ncol=2)
```

```{r}
checkresiduals(m10)
```

```{r}
naives_m <- Arima(train_ts, 
      order=c(0, 0, 0), 
      seasonal=list(order=c(0,1,0), period=28))

sfit <- train_ts %>% snaive(m=28) 
p1 <- naives_m %>% forecast(h=test_size) %>% autoplot(PI=FALSE)
p2 <- sfit %>% forecast(h=test_size) %>% autoplot()

grid.arrange(p1, p2, nrow=2, ncol=1)
```

```{r}
sfit <- snaive(train_ts) 
yhat_sfit <- sfit %>% forecast(h=test_size)
sfit %>% forecast(h=test_size) %>% autoplot()
```

## Accuracy Measures

```{r}
# SARIMA models
m1 <- Arima(train_ts, 
      order=c(1, 1, 0), 
      seasonal=list(order=c(0,1,1), period=28))

m2 <- Arima(train_ts, 
      order=c(4, 1, 0), 
      seasonal=list(order=c(0,1,1), period=28))

m3 <- Arima(train_ts, 
      order=c(4, 1, 0), 
      seasonal=list(order=c(0,1,4), period=28))

m4 <- Arima(train_ts, 
      order=c(3, 0, 2), 
      seasonal=list(order=c(2,1,0), period=13))

m5 <- Arima(train_ts, 
      order=c(4, 0, 0), 
      seasonal=list(order=c(2,1,0), period=28))

s_naive <- Arima(train_ts, 
      order=c(0, 0, 0), 
      seasonal=list(order=c(0,1,0), period=28))


yhat_m1 <- m1 %>% forecast(h=test_size)
yhat_m2 <- m2 %>% forecast(h=test_size)
yhat_m3 <- m3 %>% forecast(h=test_size)
yhat_m4 <- m4 %>% forecast(h=test_size)
yhat_m5 <- m5 %>% forecast(h=test_size)
yhat_naive <- s_naive %>% forecast(h=test_size)
```


```{r}
accuracy(yhat_m1, test_ts)
accuracy(yhat_m2, test_ts)
accuracy(yhat_m3, test_ts)
accuracy(yhat_m4, test_ts)
accuracy(yhat_m5, test_ts)
accuracy(yhat_naive, test_ts)
```

```{r}
# 
mse_tsglm1 <- mean((y_test - yhat_m5$median)^2)
mse_tsglm28 <- mean((y_test - yhat_m6$median)^2)

mae_tsglm1 <- mean(abs(y_test - yhat_m5$median))
mae_tsglm28 <- mean(abs(y_test - yhat_m6$median))

mse_tsglm1
mse_tsglm28
mae_tsglm1
mae_tsglm28

```

```{r}



```

